Imports

Load data

pycombat_batch_corrected_counts_path = "data/batch_corrected_data/pycombat_batch_corrected_counts.tsv"
metadata_path = "data/formatted_rnaseq_data/metadata.tsv"
pycombat_batch_corrected_dex_results_path = "data/differential_analysis/pycombat_batch_corrected_dex_results.tsv"
pycombat_batch_corrected_dex_selected_genes_path = "data/differential_analysis/pycombat_batch_corrected_dex_selected_genes.tsv"
pycombat_batch_corrected_counts = as.data.frame(read.table(
  pycombat_batch_corrected_counts_path,
  sep = "\t",
  header = TRUE,
  row.names = 1
))

print(pycombat_batch_corrected_counts)

metadata = as.data.frame(read.table(
  metadata_path,
  sep = "\t",
  header = TRUE,
  row.names = 1
))

print(metadata)

DESeq

Setup dds

design = ~ type + dataset_name

dds = DESeqDataSetFromMatrix(
  countData = pycombat_batch_corrected_counts,
  colData = metadata,
  design = design)
Warning: some variables in design formula are characters, converting to factors

Run deseq

dds <- DESeq(dds, test="LRT", reduced = ~ dataset_name)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 2239 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
save(dds, file = paste0("data/differential_analysis/pycombat_batch_corrected_counts_dds.RData"))

Examine results

res = as.data.frame(results(dds))

# Format results
res = as.data.frame(res)
res = res %>% 
  relocate(padj) %>%
  relocate(pvalue)

print(res)

res_path = paste0(pycombat_batch_corrected_dex_results_path)
write.table(res, file = res_path, sep = "\t")

res$abs_l2fc = abs(res$log2FoldChange)
print(res)

Select features

Visualize spread of l2fc values to determine a sensible cutoff for feature selection. Note that l2fc does not have a direct meaning since we’re doing an LRT test.

# Calculate the cumulative count
res$abs_l2fc = abs(res$log2FoldChange)
sorted_abs_l2fc <- sort(res$abs_l2fc)
cumulative_counts <- cumsum(table(sorted_abs_l2fc))

# Plot the cumulative counts
plot(names(cumulative_counts), cumulative_counts, 
     type="s",  # Step plot
     main="Cumulative Count Plot for L2FC of genes",
     xlab="L2FC Value", 
     ylab="Cumulative Count",
     col="blue")

cutoff = 2 # Chosen from looking at the plot
# Add a horizontal line at cutoff
abline(v = cutoff, col="red", lty=2)


selected_genes = res[res$abs_l2fc > cutoff,]
print(selected_genes)

selected_genes_path = paste0(pycombat_batch_corrected_dex_selected_genes_path)
write.table(selected_genes, file = selected_genes_path, sep = "\t")


# # Identify inflection point
# x_values <- as.numeric(names(cumulative_counts))
# y_values <- as.numeric(cumulative_counts)
# 
# # Approximate first and second derivatives
# first_derivative <- diff(y_values)
# second_derivative <- diff(first_derivative)
# 
# inflection_point_index <- which(diff(sign(second_derivative)) != 0) + 1
# inflection_point <- x_values[inflection_point_index]
# 
# # Add the inflection point to the plot
# points(inflection_point, y_values[inflection_point_index], col="red", pch=19)
# abline(v = inflection_point, col="red", lty=2)
# 
# # Print the inflection point
# print(paste("Inflection point at x =", inflection_point))
LS0tCnRpdGxlOiAiRGlmZmVyZW50aWFsIGFuYWx5c2lzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBJbXBvcnRzCgpgYGB7ciBsb2FkIHBhY2thZ2VzLCBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KERFU2VxMikKbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKIyMgTG9hZCBkYXRhCgpgYGB7cn0KcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50c19wYXRoID0gImRhdGEvYmF0Y2hfY29ycmVjdGVkX2RhdGEvcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50cy50c3YiCm1ldGFkYXRhX3BhdGggPSAiZGF0YS9mb3JtYXR0ZWRfcm5hc2VxX2RhdGEvbWV0YWRhdGEudHN2IgpweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3Jlc3VsdHNfcGF0aCA9ICJkYXRhL2RpZmZlcmVudGlhbF9hbmFseXNpcy9weWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3Jlc3VsdHMudHN2IgpweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3NlbGVjdGVkX2dlbmVzX3BhdGggPSAiZGF0YS9kaWZmZXJlbnRpYWxfYW5hbHlzaXMvcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2RleF9zZWxlY3RlZF9nZW5lcy50c3YiCmBgYAoKYGBge3J9CnB5Y29tYmF0X2JhdGNoX2NvcnJlY3RlZF9jb3VudHMgPSBhcy5kYXRhLmZyYW1lKHJlYWQudGFibGUoCiAgcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50c19wYXRoLAogIHNlcCA9ICJcdCIsCiAgaGVhZGVyID0gVFJVRSwKICByb3cubmFtZXMgPSAxCikpCgpwcmludChweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfY291bnRzKQoKbWV0YWRhdGEgPSBhcy5kYXRhLmZyYW1lKHJlYWQudGFibGUoCiAgbWV0YWRhdGFfcGF0aCwKICBzZXAgPSAiXHQiLAogIGhlYWRlciA9IFRSVUUsCiAgcm93Lm5hbWVzID0gMQopKQoKcHJpbnQobWV0YWRhdGEpCmBgYAoKIyMgREVTZXEKCiMjIyBTZXR1cCBkZHMKCmBgYHtyfQpkZXNpZ24gPSB+IHR5cGUgKyBkYXRhc2V0X25hbWUKCmRkcyA9IERFU2VxRGF0YVNldEZyb21NYXRyaXgoCiAgY291bnREYXRhID0gcHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2NvdW50cywKICBjb2xEYXRhID0gbWV0YWRhdGEsCiAgZGVzaWduID0gZGVzaWduKQpgYGAKCiMjIyBSdW4gZGVzZXEKCmBgYHtyfQpkZHMgPC0gREVTZXEoZGRzLCB0ZXN0PSJMUlQiLCByZWR1Y2VkID0gfiBkYXRhc2V0X25hbWUpCnNhdmUoZGRzLCBmaWxlID0gcGFzdGUwKCJkYXRhL2RpZmZlcmVudGlhbF9hbmFseXNpcy9weWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfY291bnRzX2Rkcy5SRGF0YSIpKQpgYGAKIyMjIEV4YW1pbmUgcmVzdWx0cwoKYGBge3J9CnJlcyA9IGFzLmRhdGEuZnJhbWUocmVzdWx0cyhkZHMpKQoKIyBGb3JtYXQgcmVzdWx0cwpyZXMgPSBhcy5kYXRhLmZyYW1lKHJlcykKcmVzID0gcmVzICU+JSAKICByZWxvY2F0ZShwYWRqKSAlPiUKICByZWxvY2F0ZShwdmFsdWUpCgpwcmludChyZXMpCgpyZXNfcGF0aCA9IHBhc3RlMChweWNvbWJhdF9iYXRjaF9jb3JyZWN0ZWRfZGV4X3Jlc3VsdHNfcGF0aCkKd3JpdGUudGFibGUocmVzLCBmaWxlID0gcmVzX3BhdGgsIHNlcCA9ICJcdCIpCmBgYAojIyBTZWxlY3QgZmVhdHVyZXMKClZpc3VhbGl6ZSBzcHJlYWQgb2YgbDJmYyB2YWx1ZXMgdG8gZGV0ZXJtaW5lIGEgc2Vuc2libGUgY3V0b2ZmIGZvciBmZWF0dXJlIHNlbGVjdGlvbi4gTm90ZSB0aGF0IGwyZmMgZG9lcyBub3QgaGF2ZSBhIGRpcmVjdCBtZWFuaW5nIHNpbmNlIHdlJ3JlIGRvaW5nIGFuIExSVCB0ZXN0LgoKYGBge3J9CiMgQ2FsY3VsYXRlIHRoZSBjdW11bGF0aXZlIGNvdW50CnJlcyRhYnNfbDJmYyA9IGFicyhyZXMkbG9nMkZvbGRDaGFuZ2UpCnNvcnRlZF9hYnNfbDJmYyA8LSBzb3J0KHJlcyRhYnNfbDJmYykKY3VtdWxhdGl2ZV9jb3VudHMgPC0gY3Vtc3VtKHRhYmxlKHNvcnRlZF9hYnNfbDJmYykpCgojIFBsb3QgdGhlIGN1bXVsYXRpdmUgY291bnRzCnBsb3QobmFtZXMoY3VtdWxhdGl2ZV9jb3VudHMpLCBjdW11bGF0aXZlX2NvdW50cywgCiAgICAgdHlwZT0icyIsICAjIFN0ZXAgcGxvdAogICAgIG1haW49IkN1bXVsYXRpdmUgQ291bnQgUGxvdCBmb3IgTDJGQyBvZiBnZW5lcyIsCiAgICAgeGxhYj0iTDJGQyBWYWx1ZSIsIAogICAgIHlsYWI9IkN1bXVsYXRpdmUgQ291bnQiLAogICAgIGNvbD0iYmx1ZSIpCgpjdXRvZmYgPSAyICMgQ2hvc2VuIGZyb20gbG9va2luZyBhdCB0aGUgcGxvdAojIEFkZCBhIGhvcml6b250YWwgbGluZSBhdCBjdXRvZmYKYWJsaW5lKHYgPSBjdXRvZmYsIGNvbD0icmVkIiwgbHR5PTIpCgpzZWxlY3RlZF9nZW5lcyA9IHJlc1tyZXMkYWJzX2wyZmMgPiBjdXRvZmYsXQpwcmludChzZWxlY3RlZF9nZW5lcykKCnNlbGVjdGVkX2dlbmVzX3BhdGggPSBwYXN0ZTAocHljb21iYXRfYmF0Y2hfY29ycmVjdGVkX2RleF9zZWxlY3RlZF9nZW5lc19wYXRoKQp3cml0ZS50YWJsZShzZWxlY3RlZF9nZW5lcywgZmlsZSA9IHNlbGVjdGVkX2dlbmVzX3BhdGgsIHNlcCA9ICJcdCIpCgoKIyAjIElkZW50aWZ5IGluZmxlY3Rpb24gcG9pbnQKIyB4X3ZhbHVlcyA8LSBhcy5udW1lcmljKG5hbWVzKGN1bXVsYXRpdmVfY291bnRzKSkKIyB5X3ZhbHVlcyA8LSBhcy5udW1lcmljKGN1bXVsYXRpdmVfY291bnRzKQojIAojICMgQXBwcm94aW1hdGUgZmlyc3QgYW5kIHNlY29uZCBkZXJpdmF0aXZlcwojIGZpcnN0X2Rlcml2YXRpdmUgPC0gZGlmZih5X3ZhbHVlcykKIyBzZWNvbmRfZGVyaXZhdGl2ZSA8LSBkaWZmKGZpcnN0X2Rlcml2YXRpdmUpCiMgCiMgaW5mbGVjdGlvbl9wb2ludF9pbmRleCA8LSB3aGljaChkaWZmKHNpZ24oc2Vjb25kX2Rlcml2YXRpdmUpKSAhPSAwKSArIDEKIyBpbmZsZWN0aW9uX3BvaW50IDwtIHhfdmFsdWVzW2luZmxlY3Rpb25fcG9pbnRfaW5kZXhdCiMgCiMgIyBBZGQgdGhlIGluZmxlY3Rpb24gcG9pbnQgdG8gdGhlIHBsb3QKIyBwb2ludHMoaW5mbGVjdGlvbl9wb2ludCwgeV92YWx1ZXNbaW5mbGVjdGlvbl9wb2ludF9pbmRleF0sIGNvbD0icmVkIiwgcGNoPTE5KQojIGFibGluZSh2ID0gaW5mbGVjdGlvbl9wb2ludCwgY29sPSJyZWQiLCBsdHk9MikKIyAKIyAjIFByaW50IHRoZSBpbmZsZWN0aW9uIHBvaW50CiMgcHJpbnQocGFzdGUoIkluZmxlY3Rpb24gcG9pbnQgYXQgeCA9IiwgaW5mbGVjdGlvbl9wb2ludCkpCgpgYGAKCg==